Construct Non-Hierarchical P/NBD Model for Online Retail Transaction Data
In this workbook we construct the non-hierarchical P/NBD models on the synthetic data with the longer timeframe.
1 Load and Construct Datasets
We start by modelling the P/NBD model using our synthetic datasets before we try to model real-life data.
Show code
use_fit_start_date <- as.Date("2009-12-01")
use_fit_end_date <- as.Date("2010-12-01")
use_valid_start_date <- as.Date("2010-12-01")
use_valid_end_date <- as.Date("2012-12-10")1.1 Load Online Retail Transaction Data
We now want to load the online retail transaction data.
Show code
customer_cohortdata_tbl <- read_rds("data/onlineretail_cohort_tbl.rds")
customer_cohortdata_tbl |> glimpse()Rows: 5,852
Columns: 5
$ customer_id <chr> "12346", "12347", "12348", "12349", "12350", "12351", …
$ cohort_qtr <chr> "2010 Q1", "2010 Q4", "2010 Q3", "2010 Q2", "2011 Q1",…
$ cohort_ym <chr> "2010 03", "2010 10", "2010 09", "2010 04", "2011 02",…
$ first_tnx_date <date> 2010-03-02, 2010-10-31, 2010-09-27, 2010-04-29, 2011-…
$ total_tnx_count <int> 3, 8, 5, 3, 1, 1, 9, 2, 1, 2, 6, 2, 5, 10, 6, 4, 10, 2…
Show code
customer_transactions_tbl <- read_rds("data/onlineretail_transactions_tbl.rds")
customer_transactions_tbl |> glimpse()Rows: 53,711
Columns: 4
$ tnx_timestamp <dttm> 2009-12-01 07:45:00, 2009-12-01 07:45:59, 2009-12-01 09…
$ invoice_id <chr> "489434", "489435", "489436", "489437", "489438", "48943…
$ customer_id <chr> "13085", "13085", "13078", "15362", "18102", "12682", "1…
$ tnx_amount <dbl> 505.30, 145.80, 630.33, 310.75, 2286.24, 426.30, 50.40, …
We re-produce the visualisation of the transaction times we used in previous workbooks.
Show code
plot_tbl <- customer_transactions_tbl |>
group_nest(customer_id, .key = "cust_data") |>
filter(map_int(cust_data, nrow) > 3) |>
slice_sample(n = 30) |>
unnest(cust_data)
ggplot(plot_tbl, aes(x = tnx_timestamp, y = customer_id)) +
geom_line() +
geom_point() +
labs(
x = "Date",
y = "Customer ID",
title = "Visualisation of Customer Transaction Times"
) +
theme(axis.text.y = element_text(size = 10))1.2 Construct Datasets
Having loaded the synthetic data we need to construct a number of datasets of derived values.
Show code
customer_summarystats_tbl <- customer_transactions_tbl |>
drop_na(customer_id) |>
calculate_transaction_cbs_data(last_date = use_fit_end_date |> as.POSIXct())
customer_summarystats_tbl |> glimpse()Rows: 4,336
Columns: 6
$ customer_id <chr> "12346", "12347", "12348", "12349", "12351", "12352", "…
$ first_tnx_date <dttm> 2009-12-14 08:34:00, 2010-10-31 14:19:59, 2010-09-27 1…
$ last_tnx_date <dttm> 2010-10-04 16:32:59, 2010-10-31 14:19:59, 2010-09-27 1…
$ x <dbl> 14, 0, 0, 3, 0, 1, 0, 0, 2, 1, 2, 7, 5, 2, 0, 0, 0, 2, …
$ t_x <dbl> 42.04751984, 0.00000000, 0.00000000, 46.83075397, 0.000…
$ T_cal <dbl> 50.2347222, 4.3432540, 9.1965278, 51.6379960, 0.1941468…
As before, we construct a number of subsets of the data for use later on with the modelling and create some data subsets.
Show code
customer_fit_stats_tbl <- customer_summarystats_tbl
customer_fit_stats_tbl |> glimpse()Rows: 4,336
Columns: 6
$ customer_id <chr> "12346", "12347", "12348", "12349", "12351", "12352", "…
$ first_tnx_date <dttm> 2009-12-14 08:34:00, 2010-10-31 14:19:59, 2010-09-27 1…
$ last_tnx_date <dttm> 2010-10-04 16:32:59, 2010-10-31 14:19:59, 2010-09-27 1…
$ x <dbl> 14, 0, 0, 3, 0, 1, 0, 0, 2, 1, 2, 7, 5, 2, 0, 0, 0, 2, …
$ t_x <dbl> 42.04751984, 0.00000000, 0.00000000, 46.83075397, 0.000…
$ T_cal <dbl> 50.2347222, 4.3432540, 9.1965278, 51.6379960, 0.1941468…
Show code
customer_valid_stats_tbl <- customer_transactions_tbl |>
drop_na(customer_id) |>
filter(
tnx_timestamp > (use_valid_start_date |> as.POSIXct())
) |>
summarise(
tnx_count = n(),
tnx_last_interval = difftime(
max(tnx_timestamp),
use_valid_start_date,
units = "weeks"
) |>
as.numeric(),
.by = customer_id
)
customer_valid_stats_tbl |> glimpse()Rows: 4,372
Columns: 3
$ customer_id <chr> "17850", "13047", "12583", "13748", "15100", "15291"…
$ tnx_count <int> 35, 18, 18, 5, 6, 20, 27, 15, 118, 86, 8, 1, 3, 76, …
$ tnx_last_interval <dbl> 10.22996032, 48.92956349, 53.04831349, 39.77232143, …
Show code
obs_fitdata_tbl <- customer_fit_stats_tbl |>
rename(tnx_count = x)
### We need to add all the zero count customers into the valid data
obs_validdata_tbl <- customer_fit_stats_tbl |>
anti_join(customer_valid_stats_tbl, by = "customer_id") |>
transmute(customer_id, tnx_count = 0) |>
bind_rows(customer_valid_stats_tbl) |>
arrange(customer_id)We then write this data to disk.
Show code
#! echo: TRUE
obs_fitdata_tbl |> write_rds("data/onlineretail_obs_fitdata_tbl.rds")
<<<<<<< Updated upstream
obs_validdata_tbl |> write_rds("data/onlineretail_obs_validdata_tbl.rds")